/***********************************************************************************************************************************************************

  Program: 	Tables Adjusted.sas
  Purpose: 	The following tables are created for the adjusted data, incorporating the affs and uffs 
			run of the regression model:
			(There is one table for the UFFS data and one for the AFFS for each replicate. The table names have suffixes for the data type (UFFS or AFFS)
			and replicate number)

  Tables:	1	The dollar coefficients for each disease coefficient and demographic
            1a  The adjusted dollar coefficients (AFFS)
			2	Risk Factors, which are the dollar coefficients divided by the average expenditures
            2a  The adjusted risk factors (AFFS)
			3	The predicted expenditures for each beneficiary - both sample benes and MA enrollees.
                For the UFFS run, MA enrollees are not included in the table, since their data was not needed for the UFFS calculations.
			4	The actual expenditures (apy05commabad) for each beneficiary - just the sample benes
			5	The count of sample beneficiaries with the disease coefficient/demographic for each disease coefficient/demographic
				(i.e. number of enrollees with the disease coefficient/demographic set to 1)
			6	The average risk for each disease coefficient and demographic. Defined as the product of the estimated coefficient for disease 
				coefficient/demographic j and the number of sample beneficiaries with disease coefficient/demographic j 
				all divided by the number of sample beneficiaries. 
			7	The relative factor for each disease coefficient and demographic (RFj). Defined as the average risk for HCC j divided by the 
				average actual expenditures (average calculated over sample beneficiaries)
			8	The sum of the relative factors for each disease coefficient/demographic. Defined as the product of the relative factor for 
				disease coefficient/demographic j and the number of sample beneficiaries with disease coefficient/demographic j
			9	The risk scores for each sample beneficiary (based on relative factors). Defined as the sum of the products of the relative factor 
				for each disease coefficient/demographic j and the indicator of the disease coefficient/demographic j over all disease coefficients/demographics
            10	The risk scores for each sample beneficiary (based on risk factors). Defined as the sum of the products of the risk factors and the HCC indicators
            11  The X matrix: Bene identifier and perturbed HCC indicators for each beneficiary

            Each of the following tables is created for each replicate:

            IPARS: IPARS for each AFFS and MA enrollee
            AVG_IPARS: Average IPARS factor
            IPARS_NORMALIZED: Normalized IPARS for each FFS and MA enrollee
            Pre_Risk_Scores: MA enrollee risk scores using coefficients from the unadjusted regression
            Post_Risk_Scores: MA enrollee risk scores using coefficients from the adjusted regression, after IPARS normalization
            RS_Compare: MA enrollee pre- and post-risk scores with differences, relative differences, and absolute relative differences.

**********************************************************************************************************************************************************/

/* Session setup: echo macro-generated statements to the log and set the top title line. */
OPTIONS MPRINT;
TITLE "FFSA Analysis June 2019";

/* Root output library (Table 11 and the IPARS / risk-score tables are written here). */
LIBNAME out   "Z:\...\Table Output";

/* One output library per numbered table (Tables 1 through 10). */
LIBNAME out1  "Z:\...\Table Output\Table_1";
LIBNAME out2  "Z:\...\Table Output\Table_2";
LIBNAME out3  "Z:\...\Table Output\Table_3";
LIBNAME out4  "Z:\...\Table Output\Table_4";
LIBNAME out5  "Z:\...\Table Output\Table_5";
LIBNAME out6  "Z:\...\Table Output\Table_6";
LIBNAME out7  "Z:\...\Table Output\Table_7";
LIBNAME out8  "Z:\...\Table Output\Table_8";
LIBNAME out9  "Z:\...\Table Output\Table_9";
LIBNAME out10 "Z:\...\Table Output\Table_10";

/* Input libraries. DAT1 supplies the FFS and MA sample files read by the main program;
   DAT is assigned here but not referenced in the visible part of this program. */
LIBNAME dat1  "Z:\...\FFSA Calibration";
LIBNAME dat   "Z:\...\FFSA Calibration\Input Data";

/****************************************************************************************************************************

                                             MACROS

CREATETABLES_ADJ: Outputs the requested tables based on the perturbed-data regression.
CREATETABLES_AFFS: Outputs the requested tables based on the perturbed-data regression applied to the unadjusted (un-perturbed)
                   enrollee data.

****************************************************************************************************************************/

%macro createtables_adj(type,rep,nbenes=1441247);

/*
   Purpose : Writes Tables 1-11 for one replicate, based on the regression fitted to the
             perturbed FFS data.

   Params  : type   - text placed in every output table name (the caller passes uffs)
             rep    - replicate number, also placed in every output table name
             nbenes - number of sample beneficiaries used as the Table 6 denominator.
                      Defaults to 1441247, the value that used to be hard-coded.

   Reads   : WORK.ESTCOEFFS2, WORK.PREDICTIONS2 and WORK.POPHCC_P, which must exist
             before the macro is called.

   Writes  : OUT1-OUT10 (Tables 1-10), OUT (Table 11) and several WORK datasets.
             WORK.MEAN_PREDICTIONS2 is read by the caller after this macro returns,
             so it must not be deleted here.
*/

   /* Single definition of the regressor list (demographics, HCCs, interactions).
      The order of this list is the order of the coefficient rows in Table 1. */
   %local xvars;
   %let xvars = f0_34 -- f95_gt m0_34 -- m95_gt hcc1--hcc177 d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107
                dm_chf1 dm_cvd_70hccs chf_copd copd_cvd_cad_70hccs rf_chf1 rf_chf_dm;

/***********************************************/
/*Table 1 UFFS: one row per estimated coefficient */

   proc transpose data=estcoeffs2 out=out1.table_1_&type._&rep. (rename=(col1=coefficient _name_=HCC_interaction _label_=label));
   run;


/*Table 2 UFFS: risk factor = coefficient / average actual expenditures */

   proc means data=predictions2 noprint;
      var apy05commabad predicted;
      output out=mean_predictions2(drop=_type_ _freq_) mean=mean_expense mean_predicted;
      where apy05commabad~=.; /* Removes MA enrollees from the calculations. */
      title2 "Adjusted FFS Data, Average Expenses, and Predicted Expenses";
   run;

   /* MEAN_PREDICTIONS2 has a single row, so the cross join just attaches mean_expense. */
   proc sql;
      create table out2.table_2_&type._&rep. as
      select hcc_interaction,
             label,
             coefficient/mean_expense as risk_factor
      from out1.table_1_&type._&rep., mean_predictions2;
   quit;

/*Table 3 UFFS: predicted expenditures per beneficiary */

   data out3.table_3_&type._&rep.;
      set predictions2 (keep=hicno predicted);
   run;


/*Table 4 UFFS: actual expenditures per beneficiary */

   data out4.table_4_&type._&rep.;
      set pophcc_p (keep=hicno apy05commabad);
   run;


/*Table 5 UFFS: number of beneficiaries with each indicator set */

   proc means data=pophcc_p sum noprint;
      var &xvars.;
      output out=table_5_&type._&rep. sum= ;
   run;

   /* The summary dataset holds only the analysis variables (plus _TYPE_ and _FREQ_),
      so the same list selects every count column. */
   proc transpose data=table_5_&type._&rep. out=out5.table_5_&type._&rep. (rename=(col1=count _name_=HCC_interaction _label_=label));
      var &xvars.;
   run;


/*Table 6 UFFS: average risk = coefficient * count / number of sample benes */

   proc sql;
      create table out6.table_6_&type._&rep. as
      select a.hcc_interaction, a.label,
             coefficient*count/&nbenes. as avg_risk  /* nbenes is the number of sample benes */
      from out1.table_1_&type._&rep. as a full join out5.table_5_&type._&rep. as b
      on a.hcc_interaction=b.hcc_interaction;
   quit;

/*Table 7 UFFS: relative factor = average risk / average actual expenditures */

   proc sql;
      create table out7.table_7_&type._&rep. as
      select hcc_interaction,
             label,
             avg_risk/mean_expense as relative_factor
      from out6.table_6_&type._&rep., mean_predictions2;
   quit;


/*Table 8 UFFS: sum of relative factors = relative factor * count */

   proc sql;
      create table out8.table_8_&type._&rep. as
      select a.hcc_interaction, a.label,
             relative_factor*count as sum_rel_factor
      from out7.table_7_&type._&rep. as a full join out5.table_5_&type._&rep. as b
      on a.hcc_interaction=b.hcc_interaction;
   quit;


/* Row order used to line factors up with the indicator arrays in Tables 9 and 10.
   Tables 6-8 come from SQL joins without ORDER BY, so their row order is not
   guaranteed to follow the regressor order. COEF_SEQ records each coefficient
   position in Table 1, which is the transposed coefficient row. */

   data coef_order;
      set out1.table_1_&type._&rep. (keep=hcc_interaction);
      coef_seq=_n_;
   run;


/*Table 9 UFFS: risk score per beneficiary based on relative factors */

   /* Put the relative factors back into Table 1 order before transposing, so that
      column k of TABLE_7_PRIME belongs to element k of the indicator array. */
   proc sql;
      create table table_7_ordered (drop=coef_seq) as
      select o.coef_seq, f.relative_factor
      from coef_order as o left join out7.table_7_&type._&rep. as f
      on o.hcc_interaction=f.hcc_interaction
      order by o.coef_seq;
   quit;

   proc transpose data=table_7_ordered out=table_7_prime(drop=_name_);
      var relative_factor;
   run;

   data out9.table_9_&type._&rep.;
      set pophcc_p;
      if _n_=1 then set table_7_prime; /* values read by SET are retained for every row */
      array rf {105} col1--col105;
      array hcc {105} &xvars.;
      risk_score=0;
      do i=1 to 105;
         risk_score=risk_score+rf [i]*hcc [i]; /* disease risk score for each bene based on
                                                  relative factors */
      end;
      keep hicno risk_score;
   run;

/* Table 10 UFFS: risk score per beneficiary based on risk factors */

   /* Same explicit ordering as Table 9, applied to the Table 2 risk factors. */
   proc sql;
      create table table_2_ordered (drop=coef_seq) as
      select o.coef_seq, f.risk_factor
      from coef_order as o left join out2.table_2_&type._&rep. as f
      on o.hcc_interaction=f.hcc_interaction
      order by o.coef_seq;
   quit;

   proc transpose data=table_2_ordered out=table_2_prime(drop=_name_);
      var risk_factor;
   run;

   data out10.table_10_&type._&rep.;
      set pophcc_p;
      if _n_=1 then set table_2_prime;
      array nc {105} col1--col105;
      array hcc {105} &xvars.;
      risk_score=0;
      do i=1 to 105;
         risk_score=risk_score+nc [i]*hcc [i]; /* disease risk score for each bene based on
                                                  risk factors. */
      end;
      keep hicno risk_score;
   run;

 /* Table 11 UFFS: X matrix with Bene identifier */

   data out.table_11_&type._&rep.;
      set pophcc_p;
      keep hicno &xvars.;
   run;

%mend createtables_adj;

%macro createtables_affs(type,rep,nbenes=1441247);

/*
   Purpose : Writes Tables 1-11 for one replicate, based on the perturbed-data regression
             applied to the original (unperturbed) enrollee data.

   Params  : type   - text placed in every output table name (the caller passes affs)
             rep    - replicate number, also placed in every output table name
             nbenes - number of sample beneficiaries used as the Table 6 denominator.
                      Defaults to 1441247, the value that used to be hard-coded.

   Reads   : WORK.ESTCOEFFS3, WORK.PREDICTIONS3 (with its DSNAME column) and WORK.POPHCC,
             which must exist before the macro is called.

   Writes  : OUT1-OUT10 (Tables 1-10), OUT (Table 11) and several WORK datasets,
             including WORK.MEAN_PREDICTIONS3.
*/

   /* Single definition of the regressor list (demographics, HCCs, interactions).
      The order of this list is the order of the coefficient rows in Table 1. */
   %local xvars;
   %let xvars = f0_34 -- f95_gt m0_34 -- m95_gt hcc1--hcc177 d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107
                dm_chf1 dm_cvd_70hccs chf_copd copd_cvd_cad_70hccs rf_chf1 rf_chf_dm;

/***********************************************/
/*Table 1 AFFS: one row per estimated coefficient */

   proc transpose data=estcoeffs3 out=out1.table_1_&type._&rep. (rename=(col1=coefficient _name_=HCC_interaction _label_=label));
   run;


/*Table 2 AFFS: risk factor = coefficient / average actual expenditures */

   /* Averages are taken over the rows flagged UFFS only. */
   proc means data=predictions3 noprint;
      var apy05commabad predicted;
      output out=mean_predictions3(drop=_type_ _freq_) mean=mean_expense mean_predicted;
      where dsname in ('UFFS');
   run;

   /* MEAN_PREDICTIONS3 has a single row, so the cross join just attaches mean_expense. */
   proc sql;
      create table out2.table_2_&type._&rep. as
      select hcc_interaction,
             label,
             coefficient/mean_expense as risk_factor
      from out1.table_1_&type._&rep., mean_predictions3;
   quit;

/*Table 3 AFFS: predicted expenditures for the AFFS and MA enrollee rows */

   data out3.table_3_&type._&rep.;
      set predictions3 (keep=dsname hicno ma_enrollee_number predicted);
      if dsname in ('AFFS','MAEN');
      drop dsname;
   run;


/*Table 4 AFFS: actual expenditures per beneficiary */

   data out4.table_4_&type._&rep.;
      set pophcc (keep=hicno apy05commabad); /* HICNO and Expenses for Unperturbed FFS data */
   run;


/*Table 5 AFFS: number of beneficiaries with each indicator set */

   proc means data=pophcc sum noprint; /* HCC Counts for original, unperturbed FFS benes */
      var &xvars.;
      output out=table_5_&type._&rep. sum= ;
   run;

   /* The summary dataset holds only the analysis variables (plus _TYPE_ and _FREQ_),
      so the same list selects every count column. */
   proc transpose data=table_5_&type._&rep. out=out5.table_5_&type._&rep. (rename=(col1=count _name_=HCC_interaction _label_=label));
      var &xvars.;
   run;


/*Table 6 AFFS: average risk = coefficient * count / number of sample benes */

   proc sql;
      create table out6.table_6_&type._&rep. as
      select a.hcc_interaction, a.label,
             coefficient*count/&nbenes. as avg_risk  /* nbenes is the number of sample benes */
      from out1.table_1_&type._&rep. as a full join out5.table_5_&type._&rep. as b
      on a.hcc_interaction=b.hcc_interaction;
   quit;

/*Table 7 AFFS: relative factor = average risk / average actual expenditures */

   proc sql;
      create table out7.table_7_&type._&rep. as
      select hcc_interaction,
             label,
             avg_risk/mean_expense as relative_factor
      from out6.table_6_&type._&rep., mean_predictions3;
   quit;


/*Table 8 AFFS: sum of relative factors = relative factor * count */

   proc sql;
      create table out8.table_8_&type._&rep. as
      select a.hcc_interaction, a.label,
             relative_factor*count as sum_rel_factor
      from out7.table_7_&type._&rep. as a full join out5.table_5_&type._&rep. as b
      on a.hcc_interaction=b.hcc_interaction;
   quit;


/* Row order used to line factors up with the indicator arrays in Tables 9 and 10.
   Tables 6-8 come from SQL joins without ORDER BY, so their row order is not
   guaranteed to follow the regressor order. COEF_SEQ records each coefficient
   position in Table 1, which is the transposed coefficient row. */

   data coef_order;
      set out1.table_1_&type._&rep. (keep=hcc_interaction);
      coef_seq=_n_;
   run;


/*Table 9 AFFS: risk score per beneficiary based on relative factors */

   /* Put the relative factors back into Table 1 order before transposing, so that
      column k of TABLE_7_PRIME belongs to element k of the indicator array. */
   proc sql;
      create table table_7_ordered (drop=coef_seq) as
      select o.coef_seq, f.relative_factor
      from coef_order as o left join out7.table_7_&type._&rep. as f
      on o.hcc_interaction=f.hcc_interaction
      order by o.coef_seq;
   quit;

   proc transpose data=table_7_ordered out=table_7_prime(drop=_name_);
      var relative_factor;
   run;

   data out9.table_9_&type._&rep.;
      set pophcc;
      if _n_=1 then set table_7_prime; /* values read by SET are retained for every row */
      array rf {105} col1--col105;
      array hcc {105} &xvars.;
      risk_score=0;
      do i=1 to 105;
         risk_score=risk_score+rf [i]*hcc [i]; /* disease risk score for each bene based on
                                                  relative factors */
      end;
      keep hicno risk_score;
   run;

/* Table 10 AFFS: risk score per beneficiary based on risk factors */

   /* Same explicit ordering as Table 9, applied to the Table 2 risk factors. */
   proc sql;
      create table table_2_ordered (drop=coef_seq) as
      select o.coef_seq, f.risk_factor
      from coef_order as o left join out2.table_2_&type._&rep. as f
      on o.hcc_interaction=f.hcc_interaction
      order by o.coef_seq;
   quit;

   proc transpose data=table_2_ordered out=table_2_prime(drop=_name_);
      var risk_factor;
   run;

   data out10.table_10_&type._&rep.;
      set pophcc;
      if _n_=1 then set table_2_prime;
      array nc {105} col1--col105;
      array hcc {105} &xvars.;
      risk_score=0;
      do i=1 to 105;
         risk_score=risk_score+nc [i]*hcc [i]; /* disease risk score for each bene based on
                                                  risk factors. */
      end;
      keep hicno risk_score;
   run;

 /* Table 11 AFFS: X matrix with Bene identifier */

   data out.table_11_&type._&rep.;
      set pophcc;
      keep hicno &xvars.;
   run;

%mend createtables_affs;

/****************************************************************************************************************************

                                             MAIN PROGRAM

****************************************************************************************************************************/



/* Load the two input populations:

   POPHCC:   The original FFS data (1.4 million records). It carries the original
             expenditures and HCC information, including the number of claims per
             HCC for each enrollee.
   MASTERMA: The random sample of 2 million MA enrollees.  */

data pophcc;
   set dat1.y5r1s15f;
run;

proc contents data=pophcc;
run;

/* Stack the two MA sample files, set both expenditure fields to missing, number the
   enrollees in read order, and rename four columns to the names the rest of this
   program uses for them. */
data masterma;
   set dat1.samptb_y13_full1m
       dat1.samptb_y13_elig1m;

   length f0_34 -- f95_gt m0_34 -- m95_gt 4
          hcc1 -- hcc177 3;

   apy05commabad      = .;
   newexpenditures    = .;
   ma_enrollee_number = _n_;

   /* KEEP refers to the pre-rename names; the RENAME statement is applied on output. */
   keep apy05commabad newexpenditures
        f0_34 -- f95_gt m0_34 -- m95_gt hcc1 -- hcc177
        originally_disabled_female_aged
        medicaid_female_aged medicaid_female_disabled
        originally_disabled_male_aged
        medicaid_male_Aged medicaid_male_disabled
        d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107
        dm_chf1 dm_cvd chf_copd copd_cvd_cad rf_chf1 rf_chf_dm
        ma_enrollee_number;

   rename originally_disabled_female_aged = originallydisabled_female
          originally_disabled_male_aged   = originallydisabled_male
          dm_cvd                          = dm_cvd_70hccs
          copd_cvd_cad                    = copd_cvd_cad_70hccs;
run;

/* Inspect the structure of both files. */

proc contents data=pophcc;
run;

proc contents data=masterma;
run;

/* Print a few records from each file. */

proc print data=pophcc (obs=1);
   title2 "1 Observation from POPHCC";
run;

proc print data=masterma (obs=10);
   title2 "10 Observations from MASTERMA";
run;

/* Confirm that APY05COMMABAD is missing for every MA enrollee. */

proc freq data=masterma;
   tables apy05commabad / missing;
   title2 "Frequency Distribution of APY05COMMABAD for MA Enrollees";
run;

/* Step 1

   Fit the expenditure regression on the original FFS data (unperturbed expenditures,
   unRADVed HCCs), then derive risk factors (coefficients divided by average
   expenditures) and a risk score for every beneficiary.

   The MA enrollees are appended to the regression input so that they receive
   predictions from the unadjusted model; those predictions are used in Step 5.
   They do not influence the fit, because apy05commabad is missing on every
   MA enrollee record. */

proc means data=pophcc;
   var apy05commabad;
   title2 "Average Expenditures for Original FFS Data";
run;

/* FFS records first, MA enrollees after them. */
data pophcc_ma;
   set pophcc
       masterma;
run;

/* No-intercept model on demographics, HCCs and interactions. */
proc reg data=pophcc_ma
         outest=estcoeffs1(drop=_type_ _model_ _rmse_ _depvar_ apy05commabad);
   model apy05commabad = f0_34 -- f95_gt m0_34 -- m95_gt hcc1 -- hcc177
                         d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107
                         dm_chf1 dm_cvd_70hccs chf_copd copd_cvd_cad_70hccs
                         rf_chf1 rf_chf_dm
                         / noint;
   output out=predictions1 (keep=hicno ma_enrollee_number apy05commabad predicted)
          predicted=predicted;
   title2 "Regression on Original FFS Data";
run;

/* Mean actual and mean predicted expenditures over the FFS records only. */
proc means data=predictions1 noprint;
   where apy05commabad ne .; /* excludes the MA enrollees */
   var apy05commabad predicted;
   output out=mean_predictions1(drop=_type_ _freq_) mean=mean_expense mean_predicted;
   title2 "Original FFS Data, Average Expenses, and Predicted Expenses";
run;

/* Risk score = predicted expenditures / mean actual expenditures.
   The one-row mean is read once and retained for every record. */
data predictions1;
   set predictions1;
   if _n_ = 1 then set mean_predictions1(keep=mean_expense);
   risk_score = predicted / mean_expense;
run;

/* The average risk score over the FFS records should equal 1. */

proc means data=predictions1;
   where apy05commabad ne .;
   var risk_score;
   title2 "Average Risk Score";
   title3 "Verify that Average Risk Score = 1";
run;

/* Attach the mean columns to the coefficient row, then discard mean_expense. */
data estcoeffs1(drop=mean_expense);
   set estcoeffs1;
   if _n_ = 1 then set mean_predictions1;
run;

%macro bootstrap(nreps=50);

   /*
      Purpose : Runs Steps 2-5 once per replicate: perturb the FFS HCC indicators,
                re-fit the regression, compute IPARS for the original FFS data and
                the MA enrollees, and compare pre and post risk scores.

      Params  : nreps - number of replicates to run (default 50, the former fixed count).

      Reads   : WORK.POPHCC, WORK.MASTERMA and WORK.PREDICTIONS1, created by the
                main program before this macro is invoked.

      Writes  : per replicate, the tables produced by CREATETABLES_ADJ and
                CREATETABLES_AFFS, plus in OUT: TABLE_IPARS, AVG_IPARS,
                TABLE_IPARS_NORMALIZED, TABLE_PRE_RISK_SCORES, TABLE_POST_RISK_SCORES,
                TABLE_RS_COMPARE (enrollee level) and TABLE_RS_COMPARE_STATS (summary),
                and in OUT1 / OUT2: TABLE_1A_AFFS and TABLE_2A_AFFS.
   */

   %local i;

   /* Perturbation helpers, defined once instead of once per replicate.
      For an indicator that is currently 1, draw a uniform random number and reset the
      indicator to 0 (flagging profile_change) when the draw is below p**n, where p is
      .461591, .337938 or .209114 for category H, N or L.
        hcc - HCC number, cat - category letter (H, N or L), n - exponent applied to p. */

   /* Hierarchical HCCs. */

   %macro h(hcc,cat,n);
      if hcc&hcc.=1 then do;
         r&hcc.=1*rand("unif");
         category="&cat.";
         if category='H' then do;
            if r&hcc.<.461591**&n. then do; profile_change=1; hcc&hcc.=0; end;
         end;
         if category='N' then do;
            if r&hcc.<.337938**&n. then do; profile_change=1; hcc&hcc.=0; end;
         end;
         if category='L' then do;
            if r&hcc.<.209114**&n. then do; profile_change=1; hcc&hcc.=0; end;
         end;
      end;
   %mend h;

   /* Non-hierarchical HCCs (same rule as the hierarchical helper). */

   %macro nh(hcc,cat,n);
      if hcc&hcc.=1 then do;
         r&hcc.=1*rand("unif");
         category="&cat.";
         if category='H' then do;
            if r&hcc.<.461591**&n. then do; profile_change=1; hcc&hcc.=0; end;
         end;
         if category='N' then do;
            if r&hcc.<.337938**&n. then do; profile_change=1; hcc&hcc.=0; end;
         end;
         if category='L' then do;
            if r&hcc.<.209114**&n. then do; profile_change=1; hcc&hcc.=0; end;
         end;
      end;
   %mend nh;

   %do i=1 %to &nreps.;

      /* Step 2: Randomly perturb the HCC data based on the supplied probabilities. */

      data pophcc_p;
         set pophcc;
         profile_change=0;
         call streaminit(32*&i.); /* the seed depends on the replicate number */

         /* Hierarchical HCCs. The call order fixes the order of the random draws. */

         %h(15,L,3.290);
         %h(16,L,3.290);
         %h(18,L,2.265);
         %h(19,L,6.199);
         %h(7,N,7.002);
         %h(8,N,13.396);
         %h(9,N,11.029);
         %h(10,H,7.043);
         %h(25,N,3.735);
         %h(26,N,5.214);
         %h(27,N,3.194);
         %h(51,N,2.249);
         %h(52,N,3.568);
         %h(54,H,8.956);
         %h(55,H,6.109);
         %h(67,N,4.540);
         %h(68,N,4.058);
         %h(69,N,2.430);
         %h(157,N,3.555);
         %h(78,N,2.010);
         %h(79,L,5.005);
         %h(81,N,3.580);
         %h(82,N,2.921);
         %h(83,H,2.417);
         %h(95,N,4.671);
         %h(96,H,4.105);
         %h(100,N,3.176);
         %h(101,N,2.645);
         %h(148,N,2.650);
         %h(149,L,4.844);
         %h(104,N,4.046);
         %h(105,N,3.442);
         %h(5,H,2.715);
         %h(112,N,2.332);
         %h(111,N,3.159);
         %h(107,N,6.524);
         %h(108,L,4.980);
         %h(130,L,4.480);
         %h(131,N,4.727);
         %h(132,N,2.307);
         %h(161,N,3.066);
         %h(177,N,3.402);
         %h(154,N,2.340);
         %h(75,N,2.367);
         %h(155,H,3.128);
         %h(17,N,2.638);
         %h(77,N,3.202);

         /* Non-hierarchical HCCs. */

         %nh(1,N,11.446);
         %nh(2,N,4.059);
         %nh(21,N,2.476);
         %nh(31,N,3.486);
         %nh(32,L,3.744);
         %nh(33,N,3.654);
         %nh(37,N,4.684);
         %nh(38,L,4.922);
         %nh(44,H,6.106);
         %nh(45,H,3.792);
         %nh(70,N,3.279);
         %nh(71,H,2.646);
         %nh(72,N,7.276);
         %nh(73,N,5.240);
         %nh(74,L,4.709);
         %nh(80,N,6.073);
         %nh(92,H,7.000);
         %nh(119,N,2.639);
         %nh(150,N,2.885);
         %nh(158,H,7.536);
         %nh(164,N,2.611);
         %nh(174,N,10.143);
         %nh(176,L,2.747);

         /* Update disabled-disease interactions */

         if sum(originallydisabled_female,originallydisabled_male)=1 then do;
            d_hcc5=hcc5;
            d_hcc44=hcc44;
            d_hcc51=hcc51;
            d_hcc52=hcc52;
            d_hcc107=hcc107;
         end;

         /* Update disease-disease interactions. See page B-4 of Medicare Advantage
            Risk Adjustment Data Validation Calendar Year 2012, List of Hierarchical
            and Non-Hierarchical CMS-HCCs Final. */

         dm_chf1=sum(hcc15,hcc16,hcc17,hcc18,hcc19)*hcc80;
         i_cvd=0;
         if sum(hcc95,hcc96,hcc100,hcc101)>0 then i_cvd=1;
         dm_cvd_70hccs=sum(hcc15,hcc16,hcc17,hcc18,hcc19)*i_cvd;
         chf_copd=hcc80*hcc108;
         copd_cvd_cad_70hccs=hcc108*i_cvd*sum(hcc81,hcc82,hcc83);
         rf_chf1=hcc131*hcc80;
         rf_chf_dm=hcc131*hcc80*sum(hcc15,hcc16,hcc17,hcc18,hcc19);
         /* The three-way interaction replaces both two-way interactions. */
         if rf_chf_dm=1 then do;
            dm_chf1=0;
            rf_chf1=0;
         end;

         /* create new enrollee profile */

         profile=cats(of hcc1 -- hcc177);

         /* NOTE(review): confirm that this list removes every random-draw variable
            created by the perturbation helpers. */
         drop r1:r177 i_cvd;
      run;

      /* Step 3

         Regress expenses on the perturbed FFS data.
         Calculate risk factors for the model (estimated coefficients divided by average expenditures)
         and risk scores for each beneficiary.

      */

      proc means data=pophcc_p;
         var apy05commabad;
         title2 "Average Expenditures for Perturbed FFS Data";
      run;

      proc reg data=pophcc_p outest=estcoeffs2(drop=_type_ _model_ _rmse_ _depvar_ apy05commabad);
         model apy05commabad = f0_34 -- f95_gt m0_34 -- m95_gt hcc1 -- hcc177
                               d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107 dm_chf1 dm_cvd_70hccs
                               chf_copd copd_cvd_cad_70hccs rf_chf1 rf_chf_dm/noint;
                                              /* HCCs, demos, interactions */
         output out=predictions2 (keep=hicno apy05commabad predicted) p=predicted;
         title2 "Regression on Perturbed FFS Data";
      run;

      /* Create the UFFS tables. This call also creates MEAN_PREDICTIONS2, used below. */
      %createtables_adj(uffs,&i.);

      data predictions2;
         set predictions2;
         if _n_=1 then set mean_predictions2(keep=mean_expense);
         risk_score=predicted/mean_expense;
      run;

      proc print data=predictions2 (obs=10);
         title2 "Perturbed FFS Data with ID, Original Expenses, Predictions, and Risk Scores";
      run;

      /*  Verify that Average Risk Score equals 1. */

      proc means data=predictions2;
         var risk_score;
         title2 "Average Risk Score";
         title3 "Verify that Average Risk Score = 1";
      run;

      data estcoeffs2;
         set estcoeffs2;
         if _n_=1 then set mean_predictions2;
         drop mean_expense;
      run;

      proc print data=estcoeffs2;
         title2 "Estimated Regression Coefficients for Perturbed FFS Data Regression";
      run;

      /* Step 4

      Calculate IPARS for the AFFS data. This is done by predicting for the original data (POPHCC)
      using the model from Step 3. Implement this by running the same PROC REG as in step 3 but
      with the AFFS data appended and not included in the PROC REG fit (expenses set to missing
      for AFFS data records).

      In this step, we predict for the MA enrollees using the adjusted risk factors from the regression on
      perturbed FFS HCCs. The predicted risk scores for MA enrollees are used in Step 5 below. Note
      the regression does not use the MA enrollee data in the fit, since apy05commabad is missing
      for all MA enrollees. */

      data step4;
         set pophcc_p(in=a) pophcc(in=b) masterma(in=c);
         if b then apy05commabad=.;
         if a then dsname='UFFS';
         if b then dsname='AFFS';
         if c then dsname='MAEN';
      run;

      proc reg data=step4 outest=estcoeffs3(drop=_type_ _model_ _rmse_ _depvar_ apy05commabad);
         model apy05commabad = f0_34 -- f95_gt m0_34 -- m95_gt hcc1 -- hcc177
                               d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107 dm_chf1 dm_cvd_70hccs
                               chf_copd copd_cvd_cad_70hccs rf_chf1 rf_chf_dm/noint;
                                              /* HCCs, demos, interactions */
         output out=predictions3 (keep=hicno ma_enrollee_number dsname apy05commabad predicted) p=predicted;
         title2 "Regression on Perturbed FFS Data, applied to AFFS Data";
      run;

      /* Create the AFFS tables. */
      %createtables_affs(affs,&i.);

      proc means data=predictions3;
         class dsname;
         var apy05commabad predicted;
         output out=mean_predictions3(drop=_type_ _freq_) mean=mean_expense mean_predicted;
         title2 "UFFS and AFFS and MA Enrollee Data, Average Expenses, and Predicted Expenses";
      run;

      /* Keep only the UFFS row, whose mean_expense is the IPARS denominator. */
      data mean_predictions3;
         set mean_predictions3;
         if dsname='UFFS';
      run;

      data predictions31;
         set predictions3;
         if _n_=1 then set mean_predictions3(keep=mean_expense);
         if dsname in ('AFFS','MAEN');
         ipars=predicted/mean_expense;
      run;

      data out.table_ipars_&i.;
         set predictions31;
      run;

      proc means data=predictions31;
         var ipars;
         output out=avg_ipars(drop=_type_ _freq_) mean=avg_ipars;
         where dsname='AFFS';
         title2 "Verify that Average IPARS before Normalization is Greater than 1";
      run;

      data predictions31;
         set predictions31;
         if _n_=1 then set avg_ipars(keep=avg_ipars);
         ipars_normalized=ipars/avg_ipars;
      run;

      data out.avg_ipars_&i.;
         set avg_ipars;
      run;

      /* Output Tables 1a and 2a */

      data out1.table_1a_affs_&i.;
         set out1.table_1_affs_&i.;
         if _n_=1 then set avg_ipars(keep=avg_ipars);
         adj_coefficient=coefficient/avg_ipars;
         drop coefficient avg_ipars;
      run;

      data out2.table_2a_affs_&i.;
         set out1.table_1a_affs_&i.;
         if _n_=1 then set mean_predictions3(keep=mean_expense);
         adj_risk_factor=adj_coefficient/mean_expense;
         drop adj_coefficient mean_expense;
      run;

      data out.table_ipars_normalized_&i.;
         set predictions31;
      run;

      proc means data=predictions31;
         var ipars_normalized;
         where dsname='AFFS';
         title2 "Verify that Average IPARS after Normalization is 1.0";
      run;

      /* Step 5

         Compare the predicted pre- and post-risk scores.

      */

      /* Create dataset with pre risk score predictions for MA enrollees */

      data pre_risk_scores;
         set predictions1;
         if apy05commabad=.;
      run;

      data out.table_pre_risk_scores_&i.;
         set pre_risk_scores;
      run;

      /* Create dataset with post risk score predictions for MA enrollees */

      data post_risk_scores;
         set predictions31;
         if dsname='MAEN';
         drop ipars avg_ipars dsname;
      run;

      data out.table_post_risk_scores_&i.;
         set post_risk_scores;
      run;

      /* Compare the Pre and Post Risk Scores */

      proc sort data=pre_risk_scores;
         by ma_enrollee_number;
      run;

      proc sort data=post_risk_scores;
         by ma_enrollee_number;
      run;

      data rs_compare(rename=(risk_score=pre_risk_score ipars_normalized=post_risk_score));
         merge pre_risk_scores post_risk_scores;
         by ma_enrollee_number;
         drop hicno apy05commabad;
      run;

      proc print data=rs_compare(obs=100);
         title2 "Comparison of Pre and Post Risk Scores (100 Enrollees)";
      run;

      /* Calculate comparison statistics */

      data rs_compare;
         set rs_compare;
         difference=post_risk_score-pre_risk_score;
         rd=difference/pre_risk_score;
         ard=abs(rd);
      run;

      /* Enrollee-level comparison table. */
      data out.table_rs_compare_&i.;
         set rs_compare;
      run;

      /* Summary statistics go to their own table. Writing them to TABLE_RS_COMPARE
         would replace the enrollee-level table saved in the step above. */
      proc means data=rs_compare;
         var difference rd ard;
         output out=out.table_rs_compare_stats_&i.;
         title2 "Comparison of Pre and Post Risk Scores";
      run;

      /* Remove the per-replicate WORK datasets before the next replicate. */
      proc datasets;
         delete pophcc_p estcoeffs2 predictions2 mean_predictions2 estcoeffs3 predictions3
                mean_predictions3 predictions31 avg_ipars pre_risk_scores post_risk_scores
                rs_compare;
      run;

   %end; /* end replicate do loop */

%mend bootstrap;

/* Run the bootstrap replicates defined by the BOOTSTRAP macro. */
%bootstrap;

/* The last step inside the macro is a PROC DATASETS closed with RUN only;
   this QUIT ends that procedure after the final replicate. */
quit;
